{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "from text_dedup.utils import UnionFind\n",
    "# from text_dedup.utils import RankUnionFind\n",
    "import os"
   ],
   "outputs": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Prepare test file from https://snap.stanford.edu/data/soc-Pokec.html\n",
    "TEST_DATA_DIR = os.path.abspath(\"../data\")\n",
    "os.makedirs(TEST_DATA_DIR, exist_ok=True)\n",
    "TEST_DATA_PATH = os.path.join(TEST_DATA_DIR, \"soc-pokec-relationships.txt\")\n",
    "\n",
    "# from https://snap.stanford.edu/data/soc-Pokec.html\n",
    "# 126MiB compressed, ~400MiB decompressed\n",
    "wget_command = f'wget https://snap.stanford.edu/data/soc-pokec-relationships.txt.gz -O \"{TEST_DATA_PATH}.gz\"'"
   ],
   "outputs": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# download text.gz\n",
    "!{wget_command}\n",
    "# gunzip the text file\n",
    "!pigz -d {TEST_DATA_PATH}.gz\n"
   ],
   "outputs": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "def process_data(path, class_=UnionFind):\n",
    "    uf = class_()\n",
    "    with open(path) as f:\n",
    "        for line in f:\n",
    "            x, y = line.strip().split('\\t')\n",
    "            x = int(x)\n",
    "            y = int(y)\n",
    "            uf.union(x, y)\n",
    "    return uf"
   ],
   "outputs": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "%%timeit -n 1 -r 20\n",
    "uf = process_data(TEST_DATA_PATH, class_=UnionFind)"
   ],
   "outputs": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# union by rank completely supplants unionfind\n",
    "#%%timeit -n 1 -r 20\n",
    "#ruf = process_data(TEST_DATA_PATH, class_=RankUnionFind)"
   ],
   "outputs": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# rough estimation of the memory usage\n",
    "import pickle\n",
    "uf = process_data(TEST_DATA_PATH, class_=UnionFind)\n",
    "#ruf = process_data(TEST_DATA_PATH, class_=RankUnionFind)\n",
    "\n",
    "print(len(pickle.dumps(uf)))\n",
    "#print(len(pickle.dumps(ruf)))"
   ],
   "outputs": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
